<!DOCTYPE html>
<html lang=zh>
<head>
  <meta charset="utf-8">
  
  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
  <!-- Zoom is deliberately left enabled: locking the scale (maximum-scale=1 / user-scalable=no) prevents pinch-zoom for low-vision users. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="renderer" content="webkit">
  <meta http-equiv="Cache-Control" content="no-transform" />
  <meta http-equiv="Cache-Control" content="no-siteapp" />
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black">
  <!-- Disable automatic linking of phone numbers, e-mail addresses and street addresses on mobile browsers. -->
  <meta name="format-detection" content="telephone=no,email=no,address=no">
  <!-- Color theme for statusbar -->
  <meta name="theme-color" content="#000000" />
  <!-- 强制页面在当前窗口以独立页面显示,防止别人在框架里调用页面 -->
  <meta http-equiv="window-target" content="_top" />
  
  
  <title>谷歌学术爬虫 | chadqiu&#39;s blog</title>
  <meta name="description" content="一个针对谷歌学术(Google Scholar)的爬虫，需要科学上网。支持根据关键词搜索相关的前N篇论文，获取论文的主要信息。 根据关键词搜索 可以指定：关键词、开始时间、结束时间、返回论文的数量（建议不超过200，否则容易被封），爬取的结果包括： [论文标题, 引用数, 发表时间及机构缩写, 论文链接]，见上图划线的部分。结果会print出来，同时也会自动保存到一个excel文件。">
<meta property="og:type" content="article">
<meta property="og:title" content="谷歌学术爬虫">
<meta property="og:url" content="http://chadqiu.github.io/083653212f47.html">
<meta property="og:site_name" content="chadqiu">
<meta property="og:description" content="一个针对谷歌学术(Google Scholar)的爬虫，需要科学上网。支持根据关键词搜索相关的前N篇论文，获取论文的主要信息。 根据关键词搜索 可以指定：关键词、开始时间、结束时间、返回论文的数量（建议不超过200，否则容易被封），爬取的结果包括： [论文标题, 引用数, 发表时间及机构缩写, 论文链接]，见上图划线的部分。结果会print出来，同时也会自动保存到一个excel文件。">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="https://img-blog.csdnimg.cn/c916f42f39694b9eb19c634861784498.png">
<meta property="og:image" content="http://chadqiu.github.io/images/paper%20list.png">
<meta property="article:published_time" content="2023-01-27T09:27:35.000Z">
<meta property="article:modified_time" content="2024-11-26T16:26:22.004Z">
<!-- Author shown to link previews; matches the site/profile name used elsewhere on this page. -->
<meta property="article:author" content="chadqiu">
<meta property="article:tag" content="python">
<meta property="article:tag" content="爬虫">
<meta property="article:tag" content="谷歌学术">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://img-blog.csdnimg.cn/c916f42f39694b9eb19c634861784498.png">
  <!-- Canonical links -->
  <link rel="canonical" href="http://chadqiu.github.io/083653212f47.html">
  
    <link rel="alternate" href="/atom.xml" title="chadqiu" type="application/atom+xml">
  
  
    <!-- Favicon: root-relative like the other assets on this page, with the MIME type matching the JPEG file. -->
    <link rel="icon" href="/images/avatar.jpg" type="image/jpeg">
  
  
<link rel="stylesheet" href="/css/style.css">

  
  
  
  
<meta name="generator" content="Hexo 6.3.0"></head>


<body class="main-center theme-green" itemscope itemtype="http://schema.org/WebPage">
  <header class="header" itemscope itemtype="http://schema.org/WPHeader">
  <div class="slimContent">
    <div class="navbar-header">
      
      
      <div class="profile-block text-center">
        <!-- Profile picture linking to the author's GitHub page; the alt text gives this image-only link its accessible name. -->
        <a id="avatar" href="https://github.com/chadqiu" target="_blank" rel="noopener noreferrer">
          <img class="img-circle img-rotate" src="/images/logo.jpg" width="200" height="200" alt="chadqiu 的 GitHub 主页">
        </a>
        <h2 id="name" class="hidden-xs hidden-sm">chadqiu</h2>
        <h3 id="title" class="hidden-xs hidden-sm hidden-md">Developer &amp; Researcher</h3>
        <small id="location" class="text-muted hidden-xs hidden-sm"><i class="icon icon-map-marker"></i> Shanghai, China</small>
      </div>
      
      <!-- Site search: the sidebar form plus the (initially empty) overlay container for results.
           Class names are kept unchanged; presumably /js/insight.js binds to them (verify against that script).
           aria-label gives the placeholder-only inputs and the icon-only button an accessible name. -->
      <div class="search" id="search-form-wrap">
        <form class="search-form sidebar-form">
          <div class="input-group">
            <input type="text" class="search-form-input form-control" placeholder="搜索" aria-label="搜索" />
            <span class="input-group-btn">
              <!-- onclick="return false;" suppresses the native form submission -->
              <button type="submit" class="search-form-submit btn btn-flat" onclick="return false;" aria-label="搜索"><i class="icon icon-search" aria-hidden="true"></i></button>
            </span>
          </div>
        </form>
        <div class="ins-search">
          <div class="ins-search-mask"></div>
          <div class="ins-search-container">
            <div class="ins-input-wrapper">
              <input type="text" class="ins-search-input" placeholder="想要查找什么..." aria-label="想要查找什么" />
              <button type="button" class="close ins-close ins-selectable" data-dismiss="modal" aria-label="Close"><span aria-hidden="true">×</span></button>
            </div>
            <div class="ins-section-wrapper">
              <div class="ins-section-container"></div>
            </div>
          </div>
        </div>
      </div>
      <button class="navbar-toggle collapsed" type="button" data-toggle="collapse" data-target="#main-navbar" aria-controls="main-navbar" aria-expanded="false">
        <span class="sr-only">Toggle navigation</span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
    </div>
    <nav id="main-navbar" class="collapse navbar-collapse" itemscope itemtype="http://schema.org/SiteNavigationElement" role="navigation">
      <ul class="nav navbar-nav main-nav ">
        
        
        <li class="menu-item menu-item-home">
          <a href="/.">
            
            <i class="icon icon-home-fill"></i>
            
            <span class="menu-title">首页</span>
          </a>
        </li>
        
        
        <li class="menu-item menu-item-archives">
          <a href="/archives">
            
            <i class="icon icon-archives-fill"></i>
            
            <span class="menu-title">归档</span>
          </a>
        </li>
        
        
        <li class="menu-item menu-item-categories">
          <a href="/categories">
            
            <i class="icon icon-folder"></i>
            
            <span class="menu-title">分类</span>
          </a>
        </li>
        
        
        <li class="menu-item menu-item-tags">
          <a href="/tags">
            
            <i class="icon icon-tags"></i>
            
            <span class="menu-title">标签</span>
          </a>
        </li>
        
        
        <li class="menu-item menu-item-repository">
          <a href="/repository">
            
            <i class="icon icon-project"></i>
            
            <span class="menu-title">项目</span>
          </a>
        </li>
        
        
        <li class="menu-item menu-item-links">
          <a href="/links">
            
            <i class="icon icon-friendship"></i>
            
            <span class="menu-title">友链</span>
          </a>
        </li>
        
        
        <li class="menu-item menu-item-about">
          <a href="/about">
            
            <i class="icon icon-cup-fill"></i>
            
            <span class="menu-title">关于</span>
          </a>
        </li>
        
      </ul>
      
	
    <!-- Icon-only social links: aria-label supplies the accessible name; title is kept because
         data-toggle="tooltip" presumably reads it for the tooltip text (verify against the theme JS). -->
    <ul class="social-links">
        <li><a href="https://github.com/chadqiu" target="_blank" rel="noopener noreferrer" title="Github" aria-label="Github" data-toggle="tooltip" data-placement="top"><i class="icon icon-github" aria-hidden="true"></i></a></li>
        <li><a href="/atom.xml" target="_blank" rel="noopener" title="Rss" aria-label="Rss" data-toggle="tooltip" data-placement="top"><i class="icon icon-rss" aria-hidden="true"></i></a></li>
    </ul>

    </nav>
  </div>
</header>

  
    <aside class="sidebar" itemscope itemtype="http://schema.org/WPSideBar">
  <div class="slimContent">
    
      <div class="widget">
    <h3 class="widget-title">公告</h3>
    <div class="widget-body">
        <div id="board">
            <div class="content">
                <p>欢迎交流与分享经验!</p>
            </div>
        </div>
    </div>
</div>

    
      
  <div class="widget">
    <h3 class="widget-title">分类</h3>
    <div class="widget-body">
      <ul class="category-list"><li class="category-list-item"><a class="category-list-link" href="/categories/AI/">AI</a><span class="category-list-count">3</span></li><li class="category-list-item"><a class="category-list-link" href="/categories/tools-%E7%88%AC%E8%99%AB/">tools - 爬虫</a><span class="category-list-count">3</span></li><li class="category-list-item"><a class="category-list-link" href="/categories/%E8%AE%A1%E7%AE%97%E6%9C%BA%E5%9F%BA%E7%A1%80-%E6%95%B0%E6%8D%AE%E7%BB%93%E6%9E%84%E4%B8%8E%E7%AE%97%E6%B3%95/">计算机基础 - 数据结构与算法</a><span class="category-list-count">1</span></li></ul>
    </div>
  </div>


    
      
  <div class="widget">
    <h3 class="widget-title">标签</h3>
    <div class="widget-body">
      <ul class="tag-list" itemprop="keywords"><li class="tag-list-item"><a class="tag-list-link" href="/tags/AI/" rel="tag">AI</a><span class="tag-list-count">2</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/Java/" rel="tag">Java</a><span class="tag-list-count">1</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/NLP/" rel="tag">NLP</a><span class="tag-list-count">2</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/dataset/" rel="tag">dataset</a><span class="tag-list-count">1</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/huggingface/" rel="tag">huggingface</a><span class="tag-list-count">3</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/python/" rel="tag">python</a><span class="tag-list-count">6</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/train/" rel="tag">train</a><span class="tag-list-count">1</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/%E4%BA%8C%E5%88%86%E6%90%9C%E7%B4%A2/" rel="tag">二分搜索</a><span class="tag-list-count">1</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/%E7%88%AC%E8%99%AB/" rel="tag">爬虫</a><span class="tag-list-count">3</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/%E7%89%9B%E5%AE%A2%E7%BD%91/" rel="tag">牛客网</a><span class="tag-list-count">1</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/%E7%AE%97%E6%B3%95/" rel="tag">算法</a><span class="tag-list-count">1</span></li><li class="tag-list-item"><a class="tag-list-link" href="/tags/%E8%B0%B7%E6%AD%8C%E5%AD%A6%E6%9C%AF/" rel="tag">谷歌学术</a><span class="tag-list-count">1</span></li></ul>
    </div>
  </div>


    
      
  <div class="widget">
    <h3 class="widget-title">标签云</h3>
    <div class="widget-body tagcloud">
      <a href="/tags/AI/" style="font-size: 13.33px;">AI</a> <a href="/tags/Java/" style="font-size: 13px;">Java</a> <a href="/tags/NLP/" style="font-size: 13.33px;">NLP</a> <a href="/tags/dataset/" style="font-size: 13px;">dataset</a> <a href="/tags/huggingface/" style="font-size: 13.67px;">huggingface</a> <a href="/tags/python/" style="font-size: 14px;">python</a> <a href="/tags/train/" style="font-size: 13px;">train</a> <a href="/tags/%E4%BA%8C%E5%88%86%E6%90%9C%E7%B4%A2/" style="font-size: 13px;">二分搜索</a> <a href="/tags/%E7%88%AC%E8%99%AB/" style="font-size: 13.67px;">爬虫</a> <a href="/tags/%E7%89%9B%E5%AE%A2%E7%BD%91/" style="font-size: 13px;">牛客网</a> <a href="/tags/%E7%AE%97%E6%B3%95/" style="font-size: 13px;">算法</a> <a href="/tags/%E8%B0%B7%E6%AD%8C%E5%AD%A6%E6%9C%AF/" style="font-size: 13px;">谷歌学术</a>
    </div>
  </div>

    
      
  <div class="widget">
    <h3 class="widget-title">归档</h3>
    <div class="widget-body">
      <ul class="archive-list"><li class="archive-list-item"><a class="archive-list-link" href="/archives/2025/03/">三月 2025</a><span class="archive-list-count">1</span></li><li class="archive-list-item"><a class="archive-list-link" href="/archives/2025/02/">二月 2025</a><span class="archive-list-count">2</span></li><li class="archive-list-item"><a class="archive-list-link" href="/archives/2025/01/">一月 2025</a><span class="archive-list-count">5</span></li><li class="archive-list-item"><a class="archive-list-link" href="/archives/2024/12/">十二月 2024</a><span class="archive-list-count">5</span></li><li class="archive-list-item"><a class="archive-list-link" href="/archives/2024/11/">十一月 2024</a><span class="archive-list-count">1</span></li><li class="archive-list-item"><a class="archive-list-link" href="/archives/2023/02/">二月 2023</a><span class="archive-list-count">3</span></li><li class="archive-list-item"><a class="archive-list-link" href="/archives/2023/01/">一月 2023</a><span class="archive-list-count">5</span></li></ul>
    </div>
  </div>


    
      
  <div class="widget">
    <h3 class="widget-title">最新文章</h3>
    <div class="widget-body">
      <ul class="recent-post-list list-unstyled no-thumbnail">
        
          <li>
            
            <div class="item-inner">
              <p class="item-category">
                
              </p>
              <p class="item-title">
                <a href="/853c4069a45a.html" class="title">Z哥25-03-02-现在到底是个什么情况？</a>
              </p>
              <p class="item-date">
                <time datetime="2025-03-03T00:40:27.000Z" itemprop="datePublished">2025-03-03</time>
              </p>
            </div>
          </li>
          
          <li>
            
            <div class="item-inner">
              <p class="item-category">
                
              </p>
              <p class="item-title">
                <a href="/44a02643ff6a.html" class="title">Z哥25-02-16直播</a>
              </p>
              <p class="item-date">
                <time datetime="2025-02-17T01:05:09.000Z" itemprop="datePublished">2025-02-17</time>
              </p>
            </div>
          </li>
          
          <li>
            
            <div class="item-inner">
              <p class="item-category">
                
              </p>
              <p class="item-title">
                <a href="/d8613ad6db3e.html" class="title">Z哥25-02-09直播</a>
              </p>
              <p class="item-date">
                <time datetime="2025-02-10T00:59:45.000Z" itemprop="datePublished">2025-02-10</time>
              </p>
            </div>
          </li>
          
          <li>
            
            <div class="item-inner">
              <p class="item-category">
                
              </p>
              <p class="item-title">
                <a href="/814464067ab6.html" class="title">Z哥25-01-27直播</a>
              </p>
              <p class="item-date">
                <time datetime="2025-01-28T14:24:20.000Z" itemprop="datePublished">2025-01-28</time>
              </p>
            </div>
          </li>
          
          <li>
            
            <div class="item-inner">
              <p class="item-category">
                
              </p>
              <p class="item-title">
                <a href="/49dc4032c5df.html" class="title">Z哥25-01-24直播</a>
              </p>
              <p class="item-date">
                <time datetime="2025-01-25T12:05:29.000Z" itemprop="datePublished">2025-01-25</time>
              </p>
            </div>
          </li>
          
      </ul>
    </div>
  </div>
  

    
  </div>
</aside>

  
  
<main class="main" role="main">
  <div class="content">
  <article id="post-谷歌学术爬虫" class="article article-type-post" itemscope itemtype="http://schema.org/BlogPosting">
    
    <div class="article-header">
      
        
  
    <h1 class="article-title" itemprop="name">
      谷歌学术爬虫
    </h1>
  

      
      <div class="article-meta">
        <span class="article-date">
    <i class="icon icon-calendar-check"></i>
	<a href="/083653212f47.html" class="article-date">
	  <time datetime="2023-01-27T09:27:35.000Z" itemprop="datePublished">2023-01-27</time>
	</a>
</span>
        
  <span class="article-category">
    <i class="icon icon-folder"></i>
    <a class="article-category-link" href="/categories/tools-%E7%88%AC%E8%99%AB/">tools - 爬虫</a>
  </span>

        
  <span class="article-tag">
    <i class="icon icon-tags"></i>
	<a class="article-tag-link-link" href="/tags/python/" rel="tag">python</a>, <a class="article-tag-link-link" href="/tags/%E7%88%AC%E8%99%AB/" rel="tag">爬虫</a>, <a class="article-tag-link-link" href="/tags/%E8%B0%B7%E6%AD%8C%E5%AD%A6%E6%9C%AF/" rel="tag">谷歌学术</a>
  </span>


        

        <span class="post-comment"><i class="icon icon-comment"></i> <a href="/083653212f47.html#comments" class="article-comment-link">评论</a></span>
        
      </div>
    </div>
    <div class="article-entry marked-body" itemprop="articleBody">
      
        <p>一个针对谷歌学术(Google Scholar)的爬虫，需要科学上网。<br>支持根据关键词搜索相关的前N篇论文，获取论文的主要信息。</p>
<h2 id="根据关键词搜索"><a href="#根据关键词搜索" class="headerlink" title="根据关键词搜索"></a>根据关键词搜索</h2><p><img src="https://img-blog.csdnimg.cn/c916f42f39694b9eb19c634861784498.png" alt="网页截图"></p>
<p>可以指定：关键词、开始时间、结束时间、返回论文的数量（建议不超过200，否则容易被封），爬取的结果包括： [论文标题, 引用数, 发表时间及机构缩写, 论文链接]，见上图划线的部分<br>结果会print出来，同时也会自动保存到一个excel文件</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"><span class="keyword">import</span> urllib.request</span><br><span class="line"><span class="keyword">import</span> re</span><br><span class="line"><span class="keyword">import</span> time</span><br><span class="line"><span class="keyword">import</span> traceback</span><br><span class="line"><span class="keyword">import</span> pandas <span class="keyword">as</span> pd</span><br><span class="line"><span class="keyword">import</span> warnings</span><br><span class="line"></span><br><span class="line">warnings.filterwarnings(<span class="string">&quot;ignore&quot;</span>)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line">headers = &#123;<span class="string">&#x27;User-Agent&#x27;</span>: <span class="string">&#x27;Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) &#x27;</span></span><br><span class="line">                             <span class="string">&#x27;Chrome/90.0.4430.93 Safari/537.36&#x27;</span>&#125;</span><br><span class="line"></span><br><span class="line"><span class="keyword">def</span> <span class="title function_">get_paper_page</span>(<span class="params">url</span>):</span><br><span class="line">    req = urllib.request.Request(url=url, headers=headers)</span><br><span class="line">    res = urllib.request.urlopen(req, timeout=<span class="number">100</span>)</span><br><span class="line">    html=res.read().decode(<span class="string">&#x27;utf-8&#x27;</span>)</span><br><span class="line">    soup=BeautifulSoup(html)</span><br><span class="line">    data = [[div.select(<span 
class="string">&#x27;.gs_rt &gt; a&#x27;</span>)[<span class="number">0</span>].text, div.select(<span class="string">&#x27;.gs_fl &gt; a&#x27;</span>)[<span class="number">2</span>].string, re.search(<span class="string">&quot;- .*?\&lt;/div&gt;&quot;</span>, <span class="built_in">str</span>(div.select(<span class="string">&#x27;.gs_a&#x27;</span>)[<span class="number">0</span>])).group()[<span class="number">1</span>:-<span class="number">6</span>].replace(<span class="string">&quot;\xa0&quot;</span>, <span class="string">&quot;&quot;</span>), div.select(<span class="string">&#x27;.gs_rt &gt; a&#x27;</span>)[<span class="number">0</span>][<span class="string">&quot;href&quot;</span>]] <span class="keyword">for</span> div <span class="keyword">in</span> soup.select(<span class="string">&#x27;.gs_ri&#x27;</span>)]</span><br><span class="line">    data = [[x[<span class="number">0</span>], <span class="built_in">int</span>(x[<span class="number">1</span>][<span class="number">6</span>:]) <span class="keyword">if</span> x[<span class="number">1</span>] != <span class="literal">None</span> <span class="keyword">and</span> x[<span class="number">1</span>].startswith(<span class="string">&quot;被引用次数&quot;</span>) <span class="keyword">else</span> <span class="number">0</span>, x[<span class="number">2</span>], x[<span class="number">3</span>]] <span class="keyword">for</span> x <span class="keyword">in</span> data]</span><br><span class="line">    <span class="keyword">return</span> data</span><br><span class="line"></span><br><span class="line"><span class="keyword">def</span> <span class="title function_">save_paper_list</span>(<span class="params">data, file_name</span>):</span><br><span class="line">    data = pd.DataFrame(data, columns=[<span class="string">&#x27;paper title&#x27;</span>, <span class="string">&#x27;reference&#x27;</span>, <span class="string">&#x27;publish info&#x27;</span>, <span class="string">&#x27;url&#x27;</span>])</span><br><span 
class="line">    writer = pd.ExcelWriter(file_name)</span><br><span class="line">    data.to_excel(writer, index=<span class="literal">False</span>, encoding=<span class="string">&#x27;utf-8&#x27;</span>, sheet_name=<span class="string">&#x27;Sheet1&#x27;</span>)</span><br><span class="line">    writer.save()</span><br><span class="line">    writer.close()</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="keyword">def</span> <span class="title function_">get_paper_list_by_keywork</span>(<span class="params">keyword, start_year = <span class="literal">None</span>, end_year = <span class="literal">None</span>, max_capacity = <span class="number">100</span>, debug_mode = <span class="literal">False</span>, save_file = <span class="string">&quot;paper_list.xlsx&quot;</span>, retry_times = <span class="number">3</span></span>):</span><br><span class="line">    keyword = re.sub(<span class="string">&quot; +&quot;</span>, <span class="string">&quot;+&quot;</span>, keyword.strip())</span><br><span class="line">    url_base = <span class="string">&#x27;https://scholar.google.com/scholar?hl=zh-CN&amp;as_sdt=0%2C5&#x27;</span></span><br><span class="line"></span><br><span class="line">    url_base = url_base + <span class="string">&#x27;&amp;q=&#x27;</span> + keyword</span><br><span class="line">    <span class="keyword">if</span> start_year != <span class="literal">None</span>:</span><br><span class="line">        url_base += <span class="string">&quot;&amp;as_ylo=&quot;</span> + <span class="built_in">str</span>(start_year)</span><br><span class="line">    <span class="keyword">if</span> end_year != <span class="literal">None</span>:</span><br><span class="line">        url_base += <span class="string">&quot;&amp;as_yhi=&quot;</span> + <span class="built_in">str</span>(end_year)</span><br><span class="line"></span><br><span class="line">    start = <span class="number">0</span></span><br><span class="line">    data = 
[]</span><br><span class="line">    <span class="keyword">while</span> start &lt; max_capacity:</span><br><span class="line">        url = url_base + <span class="string">&quot;&amp;start=&quot;</span> + <span class="built_in">str</span>(start)</span><br><span class="line">        start += <span class="number">10</span></span><br><span class="line">        <span class="built_in">print</span>(url)</span><br><span class="line">        <span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(retry_times):</span><br><span class="line">            <span class="keyword">try</span>:</span><br><span class="line">                data.extend(get_paper_page(url))</span><br><span class="line">                <span class="keyword">break</span></span><br><span class="line">            <span class="keyword">except</span> Exception <span class="keyword">as</span> e:</span><br><span class="line">                <span class="keyword">if</span> i &lt; retry_times -<span class="number">1</span>:</span><br><span class="line">                    <span class="built_in">print</span>(<span class="string">&quot;error, retrying ... 
&quot;</span>)</span><br><span class="line">                <span class="keyword">else</span>:</span><br><span class="line">                    <span class="built_in">print</span>(<span class="string">&quot;error, fail to get &quot;</span>, url)</span><br><span class="line">                <span class="keyword">if</span> debug_mode:</span><br><span class="line">                    traceback.print_exc()</span><br><span class="line">                time.sleep(<span class="number">20</span>)</span><br><span class="line">        time.sleep(<span class="number">10</span>)</span><br><span class="line">    <span class="comment"># data: [论文标题, 引用数, 发表时间及机构缩写, 论文链接]</span></span><br><span class="line">    <span class="built_in">print</span>(data)</span><br><span class="line">    save_paper_list(data, save_file)</span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">&quot;__main__&quot;</span>:</span><br><span class="line">    get_paper_list_by_keywork(<span class="string">&quot; named entity recognition &quot;</span>, start_year=<span class="number">2020</span>, max_capacity=<span class="number">100</span>, debug_mode=<span class="literal">False</span>, save_file = <span class="string">&quot;paper_list.xlsx&quot;</span>)</span><br><span class="line">    <span class="built_in">print</span>(<span class="string">&quot;end&quot;</span>)</span><br></pre></td></tr></table></figure>

<p>爬取结果如下图所示：</p>
<p><img src="/images/paper%20list.png" alt="结果截图"></p>

      
    </div>
    <div class="article-footer">
      <blockquote class="mt-2x">
  <ul class="post-copyright list-unstyled">
    
    <li class="post-copyright-link hidden-xs">
      <strong>本文链接：</strong>
      <a href="http://chadqiu.github.io/083653212f47.html" title="谷歌学术爬虫" target="_blank" rel="external">http://chadqiu.github.io/083653212f47.html</a>
    </li>
    
    <li class="post-copyright-license">
      <strong>版权声明： </strong> 本博客所有文章除特别声明外，均采用 <a href="http://creativecommons.org/licenses/by/4.0/deed.zh" target="_blank" rel="external">CC BY 4.0 CN</a> 许可协议。转载请注明出处！
    </li>
  </ul>
</blockquote>


<div class="panel panel-default panel-badger">
  <div class="panel-body">
    <figure class="media">
      <div class="media-left">
        <a href="https://github.com/chadqiu" target="_blank" class="img-burn thumb-sm visible-lg">
          <img src="/images/logo.jpg" class="img-rounded w-full" alt="">
        </a>
      </div>
      <div class="media-body">
        <h3 class="media-heading"><a href="https://github.com/chadqiu" target="_blank"><span class="text-dark">chadqiu</span><small class="ml-1x">Developer &amp; Researcher</small></a></h3>
        <div>在校程序猿</div>
      </div>
    </figure>
  </div>
</div>


    </div>
  </article>
  
    
  <section id="comments">
  	
      <div id="lv-container" data-id="city" data-uid="MTAyMC81Nzk4OC8zNDQ1MQ==">
        <noscript> 为正常使用来必力评论功能请激活JavaScript</noscript>
      </div>    
    
  </section>


  
</div>

  <nav class="bar bar-footer clearfix" data-stick-bottom>
  <div class="bar-inner">
  
  <ul class="pager pull-left">
    
    <li class="prev">
      <a href="/79bdc4d1f745.html" title="二分搜索"><i class="icon icon-angle-left" aria-hidden="true"></i><span>&nbsp;&nbsp;上一篇</span></a>
    </li>
    
    
    <li class="next">
      <a href="/f06a19b2ce94.html" title="牛客网爬虫"><span>下一篇&nbsp;&nbsp;</span><i class="icon icon-angle-right" aria-hidden="true"></i></a>
    </li>
    
    
  </ul>
  
  
  
  <div class="bar-right">
    
  </div>
  </div>
</nav>
  


</main>

  <footer class="footer" itemscope itemtype="http://schema.org/WPFooter">
	
	
    <!-- Icon-only social links: aria-label supplies the accessible name; title is kept because
         data-toggle="tooltip" presumably reads it for the tooltip text (verify against the theme JS). -->
    <ul class="social-links">
        <li><a href="https://github.com/chadqiu" target="_blank" rel="noopener noreferrer" title="Github" aria-label="Github" data-toggle="tooltip" data-placement="top"><i class="icon icon-github" aria-hidden="true"></i></a></li>
        <li><a href="/atom.xml" target="_blank" rel="noopener" title="Rss" aria-label="Rss" data-toggle="tooltip" data-placement="top"><i class="icon icon-rss" aria-hidden="true"></i></a></li>
    </ul>

    <div class="copyright">
    	
        <div class="publishby">
        	Theme by <a href="https://github.com/cofess" target="_blank">cofess</a> based on <a href="https://github.com/cofess/hexo-theme-pure" target="_blank">pure</a>.
        </div>
    </div>
</footer>
  <!-- jQuery 1.12.4 from the jsDelivr CDN, requested over explicit HTTPS rather than a protocol-relative URL. -->
  <script src="https://cdn.jsdelivr.net/npm/jquery@1.12.4/dist/jquery.min.js"></script>
<script>
// Fallback: if the CDN copy did not define window.jQuery, write a script tag for the
// site-hosted copy (root-relative, like the other /js/ assets on this page).
window.jQuery || document.write('<script src="/js/jquery.min.js"><\/script>')
</script>

<script src="/js/plugin.min.js"></script>


<script src="/js/application.js"></script>


    <script>
// Publishes the search configuration as the global `INSIGHT_CONFIG`:
// UI labels for each result section plus the site root and search-index URLs.
// Presumably consumed by /js/insight.js, which is loaded right after this script.
(function (root) {
    // Section headings shown in the search results
    var labels = {
        POSTS: '文章',
        PAGES: '页面',
        CATEGORIES: '分类',
        TAGS: '标签',
        UNTITLED: '(未命名)'
    };

    root.INSIGHT_CONFIG = {
        TRANSLATION: labels,
        ROOT_URL: '/',
        CONTENT_URL: '/content.json'
    };
})(window);
</script>

<script src="/js/insight.js"></script>






   




   
    
<!-- Loader for the LiveRe comment embed script.
     `defer` only applies to scripts with a `src`, so it is not used on this inline script. -->
<script>
  (function(d, s) {
    // e: the first <script> element in the document, used as the insertion anchor
    var j, e = d.getElementsByTagName(s)[0];

    // Do nothing when a global LivereTower function already exists
    if (typeof LivereTower === 'function') { return; }

    // Create the embed <script> and mark it async
    j = d.createElement(s);
    j.src = 'https://cdn-city.livere.com/js/embed.dist.js';
    j.async = true;

    // Insert the new script immediately before the anchor element
    e.parentNode.insertBefore(j, e);
  })(document, 'script');
</script>








</body>
</html>